In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rc('xtick', labelsize=14)
matplotlib.rc('ytick', labelsize=14)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc
from pandas.tseries.offsets import Hour, Minute
import simulated_data
from graphviz import Source
In [2]:
cut = 0.55   # AUC threshold above which a subject window is flagged as anomalous
window = 24  # length of the reference period, in hours
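The detection loop further down builds its time arithmetic from pandas date offsets. A minimal illustrative sketch of how `Hour` and `Minute` combine with a timestamp (the timestamp itself is made up):

import pandas as pd
from pandas.tseries.offsets import Hour, Minute

t = pd.Timestamp("2017-01-02 03:00")
print(t + 24 * Hour())   # 2017-01-03 03:00:00
print(t + 1 * Minute())  # 2017-01-02 03:01:00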
In [3]:
# df = simulated_data.get_simulated_data()
df = simulated_data.get_simulated_fixed_data()
df.head()
Out[3]:
In [4]:
ax = df.plot(figsize=(20,7))
ax.set_xlabel("time", fontsize=16)
plt.savefig('simulated_fixed.png')
In [5]:
def check_for_anomaly(ref, sub):
    # label reference rows 0 and subject rows 1; drop the bookkeeping columns
    # (drop returns a copy, so the original frames are left untouched)
    y_ref = pd.Series([0] * ref.shape[0])
    X_ref = ref.drop(columns=['flag', 'auc_score'])
    y_sub = pd.Series([1] * sub.shape[0])
    X_sub = sub.drop(columns=['flag', 'auc_score'])
    # split reference and subject into train and test
    X_ref_train, X_ref_test, y_ref_train, y_ref_test = train_test_split(X_ref, y_ref, test_size=0.3, random_state=42)
    X_sub_train, X_sub_test, y_sub_train, y_sub_test = train_test_split(X_sub, y_sub, test_size=0.3, random_state=42)
    # combine training ref and sub samples
    X_train = pd.concat([X_ref_train, X_sub_train])
    y_train = pd.concat([y_ref_train, y_sub_train])
    # combine testing ref and sub samples
    X_test = pd.concat([X_ref_test, X_sub_test])
    y_test = pd.concat([y_ref_test, y_sub_test])
    clf = AdaBoostClassifier()
    # clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200)
    # train an AdaBoost model to tell the reference and subject data apart
    clf.fit(X_train, y_train)
    # predict on the combined test data
    y_predict = clf.predict(X_test)
    # scores = cross_val_score(clf, X, y)
    # print(scores)
    fpr, tpr, thresholds = roc_curve(y_test, y_predict)  # false and true positive rates
    auc_score = auc(fpr, tpr)  # area under the ROC curve
    print("auc_score =", auc_score, "\tfeature importances:", clf.feature_importances_)
    if auc_score > cut:
        plot_roc(fpr, tpr, auc_score)
        filename = 'tree_' + sub.index.min().strftime("%Y-%m-%d_%H")
        tree.export_graphviz(clf.estimators_[0], out_file=filename + '_1.dot')
        tree.export_graphviz(clf.estimators_[1], out_file=filename + '_2.dot')
    return auc_score
def plot_roc(fpr, tpr, roc_auc):
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--', label='Luck', alpha=.8)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
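check_for_anomaly is a classifier two-sample test: a model is trained to distinguish reference rows from subject rows, and if it can do so no better than chance (test AUC near 0.5), the two samples are statistically alike. A self-contained sketch of the same idea on synthetic data (the Gaussian shift is invented purely for illustration):

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

rng = np.random.RandomState(0)
ref = pd.DataFrame(rng.normal(0.0, 1.0, size=(500, 3)))  # reference sample
sub = pd.DataFrame(rng.normal(0.5, 1.0, size=(500, 3)))  # shifted subject sample

X = pd.concat([ref, sub])
y = pd.Series([0] * len(ref) + [1] * len(sub))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf = AdaBoostClassifier().fit(X_train, y_train)
fpr, tpr, _ = roc_curve(y_test, clf.predict(X_test))
print("AUC:", auc(fpr, tpr))  # well above 0.5, so the samples are distinguishable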
In [6]:
df['auc_score'] = 0.5  # initialize every row to the "no anomaly" score
# find min and max timestamps
start = df.index.min()
end = df.index.max()
# round start down to the hour (Timestamps are immutable, so build a new one)
start = start.replace(minute=0, second=0)
ref = window * Hour()
sub = 1 * Hour()
# slide the (reference, subject) window pair over the data hour by hour
ti = start + ref + sub
count = 0
while ti < end + 1 * Minute():
    ref_start = ti - ref - sub
    ref_end = ti - sub
    ref_df = df[(df.index >= ref_start) & (df.index < ref_end)]
    sub_df = df[(df.index >= ref_end) & (df.index < ti)]
    auc_score = check_for_anomaly(ref_df, sub_df)
    df.loc[(df.index >= ref_end) & (df.index < ti), 'auc_score'] = auc_score
    print(ti, "\trefs:", ref_df.shape[0], "\tsubjects:", sub_df.shape[0], '\tauc:', auc_score)
    ti = ti + sub
    count = count + 1
    # if count > 2: break
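At each step the loop compares the most recent hour against the preceding 24-hour reference, then advances by one hour. The window bookkeeping for a single step, purely for illustration (the current time ti is made up):

import pandas as pd
from pandas.tseries.offsets import Hour

ti = pd.Timestamp("2017-01-02 01:00")
ref, sub = 24 * Hour(), 1 * Hour()
print("reference:", ti - ref - sub, "->", ti - sub)  # the previous 24 hours
print("subject:  ", ti - sub, "->", ti)              # the most recent hour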
In [10]:
ax = df.plot(figsize=(20,7))
ax.set_xlabel("time", fontsize=14)
plt.savefig('BDT_simulated_fixed.png')
In [5]:
fig, ax = plt.subplots(figsize=(20,7))
ax.set_xlabel("time", fontsize=14)
# mark hours where the AUC score exceeds the cut as detected anomalies
df.loc[:, 'Detected'] = 0
df.loc[df.auc_score > cut, 'Detected'] = 1
ax.plot(df.flag, 'r', label='flag')
ax.plot(df.auc_score, 'g', label='auc_score')
ax.fill_between(df.index, df.Detected, color='b', alpha=0.3, label='Detected')
ax.legend(loc='upper left')
plt.show()
fig.savefig('BDT_shaded_simulated_fixed.png')
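To quantify how well the shaded detections line up with the injected anomalies, one possibility (assuming the flag and Detected columns built above) is a simple contingency table:

print(pd.crosstab(df.flag, df.Detected, rownames=['flag'], colnames=['detected']))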